0: Preparation

Defining the input and output files

# Clean the working environment
rm(list = ls())

knitr::opts_chunk$set(echo = TRUE)

# empirical_species <- "German Shepherd"

empirical_dog_breed <- Sys.getenv("empirical_dog_breed")

empirical_species <- paste(toupper(substring(unlist(strsplit(empirical_dog_breed, "_")), 1, 1)),
                             substring(unlist(strsplit(empirical_dog_breed, "_")), 2),
                             sep = "", collapse = " ")
# empirical_species <- "Labrador Retriever"
# document_title <- paste(empirical_species,"Pipeline Results - 50 N_e bottleneck scenario")

# document_title <- paste(empirical_species,"55 Gen Breed Formation - SNP chip: Post_bottleneck_generation - Mutations introduced - 50 N_e bottleneck scenario")

# document_title <- paste(empirical_species,"40 Gen Breed Formation - SNP chip: Post_bottleneck_generation - Mutations introduced - 50 N_e bottleneck scenario")

N_e_bottleneck <- as.numeric(Sys.getenv("N_e_bottleneck"))
n_simulated_generations_breed_formation <- as.numeric(Sys.getenv("n_simulated_generations_breed_formation"))
reference_population_for_snp_chip <- Sys.getenv("reference_population_for_snp_chip")

document_title <- paste0(empirical_species," ",n_simulated_generations_breed_formation," Gen Breed Formation - SNP chip: ",reference_population_for_snp_chip," - ",N_e_bottleneck," N_e bottleneck scenario")

document_sub <- Sys.getenv("document_sub_title") 


# # 
# # MAF_pruning_used <- TRUE
# MAF_pruning_used <- FALSE
# 
# N_e <- 340
# # document_sub <- paste("MAF 0.05 used, but only for H_E-computation, N_e=", N_e)
# # document_sub <- paste("MAF 0.01 used, but only for H_E-computation, N_e=", N_e)
# 
# 
# 
# 
# 
# if (MAF_pruning_used == FALSE) {
#   document_sub <- paste("No MAF-based pruning used, N_e =", N_e)
# 
# 
# } else {
#   document_sub <- paste("MAF-based pruning used, N_e =", N_e)
# }

############################################ 
# Parameters used for displaying the result
############################################ 
ROH_frequency_decimals <- 1
H_e_values_decimals <- 5
F_ROH_values_decimals <- 3

####################################  
# Defining the input files
#################################### 

Selection_strength_test_dir <- Sys.getenv("Selection_strength_test_dir")

Sweep_test_dir <-  Sys.getenv("Sweep_test_dir")

############### 
## Empirical ###
###############

### ROH hotspots ###
Empirical_data_ROH_hotspots_dir  <- Sys.getenv("Empirical_data_ROH_hotspots_dir")
Empirical_data_autosome_ROH_freq_dir <- Sys.getenv("Empirical_data_autosome_ROH_freq_dir")
### Inbreeding coefficient ###

Empirical_data_F_ROH_dir  <- Sys.getenv("Empirical_data_F_ROH_dir")

### Expected Heterozygosity distribution ###
Empirical_data_H_e_dir <- Sys.getenv("Empirical_data_H_e_dir")

############### 
## Simulated ###
###############

### Causative variant ###
variant_freq_plots_dir <- Sys.getenv("variant_freq_plots_dir")
variant_position_dir <- Sys.getenv("variant_position_dir")
pruned_replicates_count_dir <- Sys.getenv("pruned_replicates_count_dir")

Selection_causative_variant_windows_dir <- Sys.getenv("Selection_causative_variant_windows_dir")
causative_variant_windows_H_e_dir <- Sys.getenv("causative_variant_windows_H_e_dir")

### ROH hotspots ###
Neutral_model_ROH_hotspots_dir <- Sys.getenv("Neutral_model_ROH_hotspots_dir")
Neutral_model_autosome_ROH_freq_dir <- Sys.getenv("Neutral_model_autosome_ROH_freq_dir")

Selection_model_ROH_hotspots_dir  <- Sys.getenv("Selection_model_ROH_hotspots_dir")
Selection_model_autosome_ROH_freq_dir <- Sys.getenv("Selection_model_autosome_ROH_freq_dir")

### Inbreeding coefficient ###
Neutral_model_F_ROH_dir  <- Sys.getenv("Neutral_model_F_ROH_dir")
Selection_model_F_ROH_dir  <- Sys.getenv("Selection_model_F_ROH_dir")

### Expected Heterozygosity distribution ###
Neutral_model_H_e_dir <- Sys.getenv("Neutral_model_H_e_dir")
Selection_model_H_e_dir <- Sys.getenv("Selection_model_H_e_dir")


histogram_line_sizes <- 3
empirical_data_color <- "darkgreen"
neutral_model_color <- "blue" 
selection_model_color <- "purple"

output_dir <- Sys.getenv("Pipeline_results_output_dir") 


# Sys.getenv()  

# # Set the working directory for notebook chunks
# knitr::opts_knit$set(root.dir = output_dir_sweep_test)

Loading libraries

library(knitr)
## Warning: package 'knitr' was built under R version 4.3.2
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.1
# Install and load the scatterplot3d package
# install.packages("scatterplot3d")
library(scatterplot3d)

Causative variant

Variant fixation

Fixation probability

Fixation time

# Function to determine the number of rows until fixation is reached
time_to_fixation <- function(data, threshold = 0.9) {
  # Find the index of the first row where the allele frequency is 0.9 or higher
  fixation_index <- which(data$allele_frequency >= threshold)[1]
  
  # Return the number of rows until fixation is reached
  return(fixation_index - 1)
}

Summary - Variant fixation

Selection_coefficient Fixation_probability Avg_Fixation_time Min_fixation_time Max_fixation_time
s=0.4 37.7 23.80 20 29
s=0.5 54.1 18.05 14 24
s=0.6 37.0 15.00 11 20
s=0.7 57.1 10.55 9 13
s=0.8 50.0 8.75 7 10

Causative variant windows

Variant position

Window lengths

Causative variant windows

Summary - Causative variant windows

Selection_coefficient Avg_Length_Mb Avg_window_freq Min_freq Max_freq Avg_freq_variant_100k_window
1 s=0.4 2.135 91.0 48.5 100 92.6
2 s=0.5 2.355 86.9 43.0 100 88.6
4 s=0.7 2.520 85.3 37.5 100 86.7
3 s=0.6 2.810 91.3 47.0 100 93.2
5 s=0.8 3.960 88.3 42.5 100 90.2

Standard Error Confidence interval function

Confidence level <=> konfidensgrad

# Function to calculate standard error and its confidence interval
standard_error_confidence_interval_fun <- function(observed_data, confidence_level = 0.95) {
  # if ( nrow(observed_data) > 1 ) {
  
  if ( length(observed_data) > 1 ) {
      
  # Calculate standard error
  n <- length(observed_data)
  standard_deviation <- sd(observed_data)
  standard_error <- standard_deviation / sqrt(n - 1)
  
  # Calculate confidence interval based on standard error

  alpha <- (1 - confidence_level) / 2
  margin_of_error <- qnorm(1 - alpha) * standard_error
  mean_estimate <- mean(observed_data)
  # Calculate the percentiles for the confidence interval
  confidence_interval_lower_bound <- mean_estimate - margin_of_error # 2.5th percentile (2σ)
  confidence_interval_upper_bound <- mean_estimate + margin_of_error # 97.5th percentile (2σ)
  
  # Return confidence interval
  return(c(confidence_interval_lower_bound, confidence_interval_upper_bound))

    
  } else {
    return(c(NA,NA)) 
    }
  
  
} 





# # Function to calculate bootstrap confidence intervals
# standard_error_confidence_interval_fun <- function(observed_data, n_bootstraps = 1000, confidence_level = 0.95) {
#   
#   # Function to calculate the mean for each bootstrap sample
#   compute_bootstrap_mean_fun <- function(observed_data_urn) {
#     bootstrap_dataset <- sample(observed_data_urn, replace = TRUE)
#     bootstrap_estimate <- mean(bootstrap_dataset)
#     return(bootstrap_estimate)
#   }
#   
#   # Perform bootstrap resampling
#   bootstrap_point_estimates <- replicate(n_bootstraps, compute_bootstrap_mean_fun(observed_data))
#   
#   # Calculate the percentiles for the confidence interval
#   alpha <- (1 - confidence_level) / 2
#   confidence_interval_lower_bound <- quantile(bootstrap_point_estimates, alpha) # 2.5th percentile
#   confidence_interval_upper_bound <- quantile(bootstrap_point_estimates, 1 - alpha) # 97.5th percentile
#   
#   # Return the confidence interval
#   return(c(confidence_interval_lower_bound, confidence_interval_upper_bound))
# }

1: ROH-Frequency

1.1 : Autosome ROH-frequencies

1.1.1 : Empirical - ROH frequency

1.1.2: Neutral Model - ROH frequency

1.1.3: Selection Model

1.1.4 Summary - ROH-hotspot threshold

## ROH-hotspot selection testing results://n
Model Avg_ROH_hotspot_threshold
Neutral 30.0
Empirical 55.4
s=0.5 87.9
s=0.7 88.1
s=0.8 90.9
s=0.4 91.7
s=0.6 92.5

1.2 ROH-hotspots - ROH Frequency and Length

2: Inbreeding coefficient

2.1 Empirical data

## Overall Average Avg_F_ROH for  labrador_retriever : 0.2333818 //n

2.2 Neutral Model

## Point estimate F_ROH across all 20 simulations: 0.1549667 //n
## [1] "Bootstrap 95% Confidence Interval: [0.150300118218385, 0.159633346781615]"

2.3 Selection Model

## Point estimate F_ROH across all 20 simulations for  selection_model_s04_chr3 : 0.2243443 //n[1] "Bootstrap 95% Confidence Interval: [0.210686975401988, 0.238001624598012]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s05_chr3 : 0.2382117 //n[1] "Bootstrap 95% Confidence Interval: [0.226464284699583, 0.249959090300417]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s06_chr3 : 0.2640368 //n[1] "Bootstrap 95% Confidence Interval: [0.251290182073292, 0.276783337926708]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s07_chr3 : 0.2835071 //n[1] "Bootstrap 95% Confidence Interval: [0.268001015456777, 0.299013099543223]"

## Point estimate F_ROH across all 20 simulations for  selection_model_s08_chr3 : 0.3108233 //n[1] "Bootstrap 95% Confidence Interval: [0.283698645307223, 0.337947864692777]"

2.4 F_ROH summary

Model F_ROH Lower_CI Upper_CI
Neutral 0.155 0.150 0.160
s=0.4 0.224 0.211 0.238
Empirical 0.233 NA NA
s=0.5 0.238 0.226 0.250
s=0.6 0.264 0.251 0.277
s=0.7 0.284 0.268 0.299
s=0.8 0.311 0.284 0.338

2.4.1 Visualizing F_ROH

## Models where empirical F_ROH is within CI:
## [1] "s=0.4" "s=0.5"
## 
## Models where empirical F_ROH is outside CI:
## [1] "s=0.6"   "s=0.7"   "s=0.8"   "Neutral"

3: Expected Heterozygosity

3.1 Empirical data

3.2 Neutral Model

3.3 Selection Model

## Uncommented because change of analysis

3.4 Causative Variant Window

## Point estimate H_e across all 20 simulations for  s04_chr3 : 0.1737277 //n[1] "Bootstrap 95% Confidence Interval: [0.117163054312985, 0.230292329452555]"
## Point estimate H_e across all 20 simulations for  s05_chr3 : 0.1908289 //n[1] "Bootstrap 95% Confidence Interval: [0.136513347215409, 0.245144522039319]"
## Point estimate H_e across all 20 simulations for  s06_chr3 : 0.1993809 //n[1] "Bootstrap 95% Confidence Interval: [0.139775338197186, 0.258986412004102]"
## Point estimate H_e across all 20 simulations for  s07_chr3 : 0.2821872 //n[1] "Bootstrap 95% Confidence Interval: [0.202023789078136, 0.362350583744662]"
## Point estimate H_e across all 20 simulations for  s08_chr3 : 0.2492451 //n[1] "Bootstrap 95% Confidence Interval: [0.174049614931524, 0.324440559445571]"

3.4 Genomewide 5th percentile comparison - Expected Heterozygosity Summary

Model H_e_5th_percentile Lower_CI Upper_CI
s08_chr3 0.15588 0.14783 0.16394
s07_chr3 0.15987 0.15078 0.16896
s06_chr3 0.16514 0.15942 0.17086
s05_chr3 0.16670 0.16191 0.17149
s04_chr3 0.16834 0.16257 0.17412
Neutral 0.18256 0.17907 0.18606
Empirical 0.20852 NA NA

4: Summary

4.0 General comparison

4.0.1 ROH-hotspot threshold comparison

## 
##  ROH-hotspot threshold comparison between the different datasets
Model Avg_ROH_hotspot_threshold
Neutral 30.0
Empirical 55.4
s=0.5 87.9
s=0.7 88.1
s=0.8 90.9
s=0.4 91.7
s=0.6 92.5

4.0.2 F_ROH comparison

Model F_ROH Lower_CI Upper_CI
Neutral 0.155 0.150 0.160
s=0.4 0.224 0.211 0.238
Empirical 0.233 NA NA
s=0.5 0.238 0.226 0.250
s=0.6 0.264 0.251 0.277
s=0.7 0.284 0.268 0.299
s=0.8 0.311 0.284 0.338

4.1: Hotspot comparison

4.1.1: Selection test (Sweep test)

## [1] "Selection test results"
## [1] "ROH-hotspot windows with an mean H_e Value lower or equal to the lower confidence interval of the fifth percentile of the neutral model are classified as being under selection"
## [1] "5th percentile of the neutral model is: 0.1790696"
Name Under_selection Window_based_Average_H_e
Hotspot_chr1_window_1 Yes 0.12432
Hotspot_chr28_window_1 No 0.19687
Hotspot_chr24_window_1 No 0.21723
Hotspot_chr13_window_1 No 0.22478
Hotspot_chr6_window_1 No 0.26295
Hotspot_chr15_window_1 No 0.26816
Hotspot_chr11_window_2 No 0.28387
Hotspot_chr11_window_1 No 0.28931
Hotspot_chr8_window_1 No 0.36946
## [1] "ROH-hotspots under selection:"
Name Under_selection Window_based_Average_H_e
Hotspot_chr1_window_1 Yes 0.12432

4.1.2: Selection Strength Test (Sweep test)

4.1.1.1 Extracting Causative windows under selection

Model H_e Lower_CI Upper_CI Under_Selection
s=0.4 0.17373 0.11716 0.23029 Yes
Neutral 0.18256 0.17907 0.18606 No
s=0.5 0.19083 0.13651 0.24514 No
s=0.6 0.19938 0.13978 0.25899 No
s=0.8 0.24925 0.17405 0.32444 No
s=0.7 0.28219 0.20202 0.36235 No

4.1.1.1 Extracting Hotspots under selection

*** Behöver bootstrap av 5th percentiles från neutral model

Hotspot - Causative Window - Comparison

Model Lengths_Mb ROH_Freq H_e Under_selection
s=0.8 3.960 88.3 0.24925 No
s=0.6 2.810 91.3 0.19938 No
s=0.7 2.520 85.3 0.28219 No
s=0.5 2.355 86.9 0.19083 No
s=0.4 2.135 91.0 0.17373 Yes
Hotspot_chr1_window_1 0.200 61.9 0.12432 Yes
# plot_title <- paste("Length And Average ROH % Comparison - ROH Hotspots & Causative Windows")
plot_title <- paste("ROH Hotspot(s) & Causative Windows comparison")


# Modify the labels by removing "Hotspot_" prefix and "_window" suffix from hotspot models
Hotspots_and_Causative_windows_comparison_sorted$Label <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  gsub("Hotspot_|_window", "", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  Hotspots_and_Causative_windows_comparison_sorted$Model
)

# Create a new column for identifying the type
Hotspots_and_Causative_windows_comparison_sorted$Type <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  "Hotspot", 
  "Selection Coefficient"
)


# # Create the 3D scatter plot
# s3d <- scatterplot3d(
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e,
#   color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
#   pch = 19, # Solid circle
#   xlab = "Lengths",
#   ylab = "ROH Frequency",
#   zlab = "H_e",
#   main = plot_title
# )
# 
# # Add labels to the points
# s3d.coords <- s3d$xyz.convert(
#   Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
#   Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
#   Hotspots_and_Causative_windows_comparison_sorted$H_e
# )
# text(s3d.coords$x, s3d.coords$y, 
#      labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
#      pos = 3, cex = 0.5) # pos=3 means above


# Create the 3D scatter plot
s3d <- scatterplot3d(
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
  Hotspots_and_Causative_windows_comparison_sorted$H_e,
  color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
  pch = 19, # Solid circle
  xlab = "Avg ROH-frequency (%)",
  ylab = "Length (Mb)",
  zlab = "H_e",
  main = plot_title
)

# Add labels to the points
s3d.coords <- s3d$xyz.convert(
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq,
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb,
  Hotspots_and_Causative_windows_comparison_sorted$H_e
)
text(s3d.coords$x, s3d.coords$y, 
     labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
     pos = 3, cex = 0.5) # pos=3 means above

Visualizing and scaling

# Min-max-scaling normalization function, that normalizes a vector of values
min_max_normalization_fun <- function(x) {
  return((x - min(x)) / (max(x) - min(x)))
}
### Min max scaling ###
# Normalize Lengths_Mb
Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb_Normalized <- min_max_normalization_fun(Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb)
# Normalize ROH Frequency
Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq_Normalized <- min_max_normalization_fun(Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq)
# Normalize H_e
Hotspots_and_Causative_windows_comparison_sorted$H_e_Normalized <- min_max_normalization_fun(Hotspots_and_Causative_windows_comparison_sorted$H_e)

# Modify the labels by removing "Hotspot_" prefix and "_window" suffix from hotspot models
Hotspots_and_Causative_windows_comparison_sorted$Label <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  gsub("Hotspot_|_window", "", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  Hotspots_and_Causative_windows_comparison_sorted$Model
)

# Create a new column for identifying the type
Hotspots_and_Causative_windows_comparison_sorted$Type <- ifelse(
  grepl("Hotspot", Hotspots_and_Causative_windows_comparison_sorted$Model), 
  "Hotspot", 
  "Selection Coefficient"
)

plot_title <- paste("Normalized ROH Hotspot(s) & Causative Windows comparison")


# Create the 3D scatter plot
s3d <- scatterplot3d(
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$H_e_Normalized,
  color = ifelse(Hotspots_and_Causative_windows_comparison_sorted$Type == "Hotspot", "red", "blue"),
  pch = 19, # Solid circle
  xlab = "Normalized Lengths",
  ylab = "Normalized Mean ROH Frequency",
  zlab = "Normalized H_e",
  main = plot_title
)

# Add labels to the points
s3d.coords <- s3d$xyz.convert(
  Hotspots_and_Causative_windows_comparison_sorted$Lengths_Mb_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$ROH_Freq_Normalized,
  Hotspots_and_Causative_windows_comparison_sorted$H_e_Normalized
)
text(s3d.coords$x, s3d.coords$y, 
     labels = Hotspots_and_Causative_windows_comparison_sorted$Label, 
     pos = 3, cex = 0.5) # pos=3 means above

5 Visualizing Expected Heterozygoisty

5.1 Bootstrap CI for mean 5th percentile H_e

## TRUE

## Models where empirical H_e is within CI for Hotspot_chr1_window_1 :
## [1] "s=0.4"
## 
## Models where empirical H_e is outside CI for Hotspot_chr1_window_1 :
## character(0)
## TRUE

## Models where empirical H_e is within CI for Hotspot_chr28_window_1 :
## [1] "s=0.4"
## 
## Models where empirical H_e is outside CI for Hotspot_chr28_window_1 :
## character(0)
## TRUE

## Models where empirical H_e is within CI for Hotspot_chr24_window_1 :
## [1] "s=0.4"
## 
## Models where empirical H_e is outside CI for Hotspot_chr24_window_1 :
## character(0)
## TRUE

## Models where empirical H_e is within CI for Hotspot_chr13_window_1 :
## [1] "s=0.4"
## 
## Models where empirical H_e is outside CI for Hotspot_chr13_window_1 :
## character(0)
## FALSE

## Models where empirical H_e is within CI for Hotspot_chr6_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr6_window_1 :
## [1] "s=0.4"
## FALSE

## Models where empirical H_e is within CI for Hotspot_chr15_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr15_window_1 :
## [1] "s=0.4"
## FALSE

## Models where empirical H_e is within CI for Hotspot_chr11_window_2 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr11_window_2 :
## [1] "s=0.4"
## FALSE

## Models where empirical H_e is within CI for Hotspot_chr11_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr11_window_1 :
## [1] "s=0.4"
## FALSE

## Models where empirical H_e is within CI for Hotspot_chr8_window_1 :
## character(0)
## 
## Models where empirical H_e is outside CI for Hotspot_chr8_window_1 :
## [1] "s=0.4"

5.2 5th percentile of the extreme H_e values